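# NOTE: legacy, commented-out compact form of LLAMA100M, kept for reference.
# Every key/value listed here also appears in the active LLAMA100M definition
# further down in this file.
# LLAMA100M = {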
#     "architectures": ["LLaMAForCausalLM"],
#     "attention_bias": False,
#     "attention_dropout": 0.0,
#     "bos_token_id": 0,
#     "eos_token_id": 1,
#     "hidden_act": "silu",
#     "hidden_size": 640,
#     "initializer_range": 0.02,
#     "intermediate_size": 1708,
#     "max_position_embeddings": 2048,
#     "max_sequence_length": 1024,
#     "model_type": "llama",
#     "num_attention_heads": 10,
#     "num_hidden_layers": 12,
#     "num_key_value_heads": 10,
#     "pad_token_id": -1,
#     "pretraining_tp": 1,
#     "rms_norm_eps": 1e-06,
#     "rope_scaling": None,
#     "rope_theta": 10000.0,
#     "tie_word_embeddings": False,
#     "transformers_version": "4.39.3",
#     "use_cache": True,
#     "vocab_size": 32100,
# }
LLAMA1B = {
    "vocab_size": 32000,
    "max_position_embeddings": 2048,
    "hidden_size": 2048,
    "intermediate_size": 5461,
    "num_hidden_layers": 24,
    "num_attention_heads": 32,
    "num_key_value_heads": 32,
    "hidden_act": "silu",
    "initializer_range": 0.02,
    "rms_norm_eps": 1e-06,
    "pretraining_tp": 1,
    "use_cache": True,
    "rope_theta": 10000.0,
    "rope_scaling": None,
    "attention_bias": False,
    "attention_dropout": 0.0,
    "return_dict": True,
    "output_hidden_states": False,
    "output_attentions": False,
    "torchscript": False,
    "torch_dtype": None,
    "use_bfloat16": False,
    "tf_legacy_loss": False,
    "pruned_heads": {},
    "tie_word_embeddings": False,
    "chunk_size_feed_forward": 0,
    "is_encoder_decoder": False,
    "is_decoder": False,
    "cross_attention_hidden_size": None,
    "add_cross_attention": False,
    "tie_encoder_decoder": False,
    "max_length": 20,
    "min_length": 0,
    "do_sample": False,
    "early_stopping": False,
    "num_beams": 1,
    "num_beam_groups": 1,
    "diversity_penalty": 0.0,
    "temperature": 1.0,
    "top_k": 50,
    "top_p": 1.0,
    "typical_p": 1.0,
    "repetition_penalty": 1.0,
    "length_penalty": 1.0,
    "no_repeat_ngram_size": 0,
    "encoder_no_repeat_ngram_size": 0,
    "bad_words_ids": None,
    "num_return_sequences": 1,
    "output_scores": False,
    "return_dict_in_generate": False,
    "forced_bos_token_id": None,
    "forced_eos_token_id": None,
    "remove_invalid_values": False,
    "exponential_decay_length_penalty": None,
    "suppress_tokens": None,
    "begin_suppress_tokens": None,
    "architectures": ["LLaMAForCausalLM"],
    "finetuning_task": None,
    "id2label": {0: "LABEL_0", 1: "LABEL_1"},
    "label2id": {"LABEL_0": 0, "LABEL_1": 1},
    "tokenizer_class": None,
    "prefix": None,
    "bos_token_id": 0,
    "pad_token_id": -1,
    "eos_token_id": 1,
    "sep_token_id": None,
    "decoder_start_token_id": None,
    "task_specific_params": None,
    "problem_type": None,
    "_name_or_path": "./configs/llama_1b.json",
    "transformers_version": "4.39.3",
    "max_sequence_length": 1024,
    "model_type": "llama",
}
LLAMA100M = {
    "vocab_size": 32100,
    "max_position_embeddings": 2048,
    "hidden_size": 640,
    "intermediate_size": 1708,
    "num_hidden_layers": 12,
    "num_attention_heads": 10,
    "num_key_value_heads": 10,
    "hidden_act": "silu",
    "initializer_range": 0.02,
    "rms_norm_eps": 1e-06,
    "pretraining_tp": 1,
    "use_cache": True,
    "rope_theta": 10000.0,
    "rope_scaling": None,
    "attention_bias": False,
    "attention_dropout": 0.0,
    "return_dict": True,
    "output_hidden_states": False,
    "output_attentions": False,
    "torchscript": False,
    "torch_dtype": None,
    "use_bfloat16": False,
    "tf_legacy_loss": False,
    "pruned_heads": {},
    "tie_word_embeddings": False,
    "chunk_size_feed_forward": 0,
    "is_encoder_decoder": False,
    "is_decoder": False,
    "cross_attention_hidden_size": None,
    "add_cross_attention": False,
    "tie_encoder_decoder": False,
    "max_length": 20,
    "min_length": 0,
    "do_sample": False,
    "early_stopping": False,
    "num_beams": 1,
    "num_beam_groups": 1,
    "diversity_penalty": 0.0,
    "temperature": 1.0,
    "top_k": 50,
    "top_p": 1.0,
    "typical_p": 1.0,
    "repetition_penalty": 1.0,
    "length_penalty": 1.0,
    "no_repeat_ngram_size": 0,
    "encoder_no_repeat_ngram_size": 0,
    "bad_words_ids": None,
    "num_return_sequences": 1,
    "output_scores": False,
    "return_dict_in_generate": False,
    "forced_bos_token_id": None,
    "forced_eos_token_id": None,
    "remove_invalid_values": False,
    "exponential_decay_length_penalty": None,
    "suppress_tokens": None,
    "begin_suppress_tokens": None,
    "architectures": ["LLaMAForCausalLM"],
    "finetuning_task": None,
    "id2label": {0: "LABEL_0", 1: "LABEL_1"},
    "label2id": {"LABEL_0": 0, "LABEL_1": 1},
    "tokenizer_class": None,
    "prefix": None,
    "bos_token_id": 0,
    "pad_token_id": -1,
    "eos_token_id": 1,
    "sep_token_id": None,
    "decoder_start_token_id": None,
    "task_specific_params": None,
    "problem_type": None,
    "_name_or_path": "./configs/llama_100m.json",
    "transformers_version": "4.39.3",
    "max_sequence_length": 1024,
    "model_type": "llama",
}
